{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>Age</th>\n",
       "      <th>level</th>\n",
       "      <th>Email</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>18</td>\n",
       "      <td>high</td>\n",
       "      <td>jordon@sohu.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MIKE</td>\n",
       "      <td>30</td>\n",
       "      <td>Low</td>\n",
       "      <td>Mike@126.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Kelvin</td>\n",
       "      <td>45</td>\n",
       "      <td>M</td>\n",
       "      <td>KelvinChai@gmail.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoLi</td>\n",
       "      <td>23</td>\n",
       "      <td>L</td>\n",
       "      <td>xiaoli@163.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>qiqi</td>\n",
       "      <td>45</td>\n",
       "      <td>middle</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Amei</td>\n",
       "      <td>62</td>\n",
       "      <td>NaN</td>\n",
       "      <td>amei@qq.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name  Age   level                 Email\n",
       "0  jordon   18    high       jordon@sohu.com\n",
       "1    MIKE   30     Low           Mike@126.cn\n",
       "2  Kelvin   45       M  KelvinChai@gmail.com\n",
       "3  xiaoLi   23       L        xiaoli@163.com\n",
       "4    qiqi   45  middle                   NaN\n",
       "5    Amei   62     NaN           amei@qq.com"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df = pd.DataFrame({'name':['jordon', 'MIKE', 'Kelvin', 'xiaoLi', 'qiqi','Amei'],\n",
    "                   'Age':[18, 30, 45, 23, 45, 62],\n",
    "                   'level':['high','Low','M','L','middle',np.nan],\n",
    "                   'Email':['jordon@sohu.com','Mike@126.cn','KelvinChai@gmail.com','xiaoli@163.com',np.nan,'amei@qq.com']})\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1、文本格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    jordon\n",
       "1      mike\n",
       "2    kelvin\n",
       "3    xiaoli\n",
       "4      qiqi\n",
       "5      amei\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 文本变成小写\n",
    "df.name.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['name', 'age', 'level', 'email'], dtype='object')"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对齐\n",
    "# # 居中对齐，宽度为8，其余用’*’填充\n",
    "# s.str.center(, fillchar='*')\n",
    "# # 左对齐，宽度为8，其余用’*’填充\n",
    "# s.str.ljust(8, fillchar='*')\n",
    "# # 右对齐，宽度为8，其余用’*’填充\n",
    "# s.str.rjust(8, fillchar='*')\n",
    "# # 自定义对齐方式，参数可调整宽度、对齐方向、填充字符\n",
    "# s.str.pad(width=8, side='both',fillchar='*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    *jordon*\n",
       "1    **MIKE**\n",
       "2    *Kelvin*\n",
       "3    *xiaoLi*\n",
       "4    **qiqi**\n",
       "5    **Amei**\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 举例\n",
    "df.name.str.center(8, fillchar='*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    *jordon*\n",
       "1    **MIKE**\n",
       "2    *Kelvin*\n",
       "3    *xiaoLi*\n",
       "4    **qiqi**\n",
       "5    **Amei**\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.name.str.pad(width=8, side='both', fillchar='*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         [jordon, sohu.com]\n",
       "1            [Mike, 126.com]\n",
       "2    [KelvinChai, gmail.com]\n",
       "3          [xiaoli, 163.com]\n",
       "4                        NaN\n",
       "5             [amei, qq.com]\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.split('@')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2、文本拆分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>sohu.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Mike</td>\n",
       "      <td>126.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KelvinChai</td>\n",
       "      <td>gmail.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoli</td>\n",
       "      <td>163.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>amei</td>\n",
       "      <td>qq.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0          1\n",
       "0      jordon   sohu.com\n",
       "1        Mike     126.cn\n",
       "2  KelvinChai  gmail.com\n",
       "3      xiaoli    163.com\n",
       "4         NaN        NaN\n",
       "5        amei     qq.com"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.split('@',expand=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>sohu.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Mike</td>\n",
       "      <td>126.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KelvinChai</td>\n",
       "      <td>gmail.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoli</td>\n",
       "      <td>163.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>amei</td>\n",
       "      <td>qq.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0          1\n",
       "0      jordon   sohu.com\n",
       "1        Mike     126.cn\n",
       "2  KelvinChai  gmail.com\n",
       "3      xiaoli    163.com\n",
       "4         NaN        NaN\n",
       "5        amei     qq.com"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.split('@',expand=True,n=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>sohu</td>\n",
       "      <td>com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Mike</td>\n",
       "      <td>126</td>\n",
       "      <td>cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KelvinChai</td>\n",
       "      <td>gmail</td>\n",
       "      <td>com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoli</td>\n",
       "      <td>163</td>\n",
       "      <td>com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>amei</td>\n",
       "      <td>qq</td>\n",
       "      <td>com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0      1    2\n",
       "0      jordon   sohu  com\n",
       "1        Mike    126   cn\n",
       "2  KelvinChai  gmail  com\n",
       "3      xiaoli    163  com\n",
       "4         NaN    NaN  NaN\n",
       "5        amei     qq  com"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.split('@|\\.',expand=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3、文本替换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         jordon@sohu.cn\n",
       "1            Mike@126.cn\n",
       "2    KelvinChai@gmail.cn\n",
       "3          xiaoli@163.cn\n",
       "4                    NaN\n",
       "5             amei@qq.cn\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.replace('com','cn')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     xxx@sohu.com\n",
       "1       xxx@126.cn\n",
       "2    xxx@gmail.com\n",
       "3      xxx@163.com\n",
       "4              NaN\n",
       "5       xxx@qq.com\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 正则\n",
    "df.Email.str.replace('(.*?)@','xxx@')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         JORDON@sohu.com\n",
       "1             MIKE@126.cn\n",
       "2    KELVINCHAI@gmail.com\n",
       "3          XIAOLI@163.com\n",
       "4                     NaN\n",
       "5             AMEI@qq.com\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.replace('(.*?)@', lambda x:x.group().upper())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "slice_replace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         jXXrdon@sohu.com\n",
       "1             MXXke@126.cn\n",
       "2    KXXlvinChai@gmail.com\n",
       "3          xXXaoli@163.com\n",
       "4                      NaN\n",
       "5             aXXei@qq.com\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.slice_replace(start=1,stop=2,repl='XX')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "repeat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    jordonjordon\n",
       "1        MIKEMIKE\n",
       "2    KelvinKelvin\n",
       "3    xiaoLixiaoLi\n",
       "4        qiqiqiqi\n",
       "5        AmeiAmei\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.name.str.repeat(repeats=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4、文本拼接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'jordonMIKEKelvinxiaoLiqiqiAmei'"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.name.str.cat()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'high-Low-M-L-middle-*'"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.level.str.cat(sep='-',na_rep='*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    jordon*high\n",
       "1       MIKE*Low\n",
       "2       Kelvin*M\n",
       "3       xiaoLi*L\n",
       "4    qiqi*middle\n",
       "5            NaN\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.name.str.cat(['*']*6).str.cat(df.level)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      jordonhighjordon@sohu.com\n",
       "1             MIKELowMike@126.cn\n",
       "2    KelvinMKelvinChai@gmail.com\n",
       "3          xiaoLiLxiaoli@163.com\n",
       "4                    qiqimiddle*\n",
       "5               Amei*amei@qq.com\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.name.str.cat([df.level,df.Email],na_rep='*')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5、文本提取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>sohu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KelvinChai</td>\n",
       "      <td>gmail</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoli</td>\n",
       "      <td>163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>amei</td>\n",
       "      <td>qq</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            0      1\n",
       "0      jordon   sohu\n",
       "1         NaN    NaN\n",
       "2  KelvinChai  gmail\n",
       "3      xiaoli    163\n",
       "4         NaN    NaN\n",
       "5        amei     qq"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.extract(pat='(.*?)@(.*).com')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>match</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>sohu</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>0</th>\n",
       "      <td>KelvinChai</td>\n",
       "      <td>gmail</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <th>0</th>\n",
       "      <td>xiaoli</td>\n",
       "      <td>163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <th>0</th>\n",
       "      <td>amei</td>\n",
       "      <td>qq</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  0      1\n",
       "  match                   \n",
       "0 0          jordon   sohu\n",
       "2 0      KelvinChai  gmail\n",
       "3 0          xiaoli    163\n",
       "5 0            amei     qq"
      ]
     },
     "execution_count": 186,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.extractall(pat='(.*?)@(.*).com')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6、文本查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Email</th>\n",
       "      <th>@position</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon@sohu.com</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Mike@126.cn</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>KelvinChai@gmail.com</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoli@163.com</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>amei@qq.com</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  Email  @position\n",
       "0       jordon@sohu.com        6.0\n",
       "1           Mike@126.cn        4.0\n",
       "2  KelvinChai@gmail.com       10.0\n",
       "3        xiaoli@163.com        6.0\n",
       "4                   NaN        NaN\n",
       "5           amei@qq.com        4.0"
      ]
     },
     "execution_count": 210,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['@position'] = df.Email.str.find('@')\n",
    "df[['Email','@position']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         [(jordon, sohu)]\n",
       "1                       []\n",
       "2    [(KelvinChai, gmail)]\n",
       "3          [(xiaoli, 163)]\n",
       "4                      NaN\n",
       "5             [(amei, qq)]\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 203,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.findall('(.*?)@(.*).com')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7、文本包含"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     True\n",
       "1    False\n",
       "2     True\n",
       "3     True\n",
       "4        *\n",
       "5     True\n",
       "Name: Email, dtype: object"
      ]
     },
     "execution_count": 212,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Email.str.contains('jordon|com',na='*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>Age</th>\n",
       "      <th>level</th>\n",
       "      <th>Email</th>\n",
       "      <th>@position</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>jordon</td>\n",
       "      <td>18</td>\n",
       "      <td>high</td>\n",
       "      <td>jordon@sohu.com</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Kelvin</td>\n",
       "      <td>45</td>\n",
       "      <td>M</td>\n",
       "      <td>KelvinChai@gmail.com</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>xiaoLi</td>\n",
       "      <td>23</td>\n",
       "      <td>L</td>\n",
       "      <td>xiaoli@163.com</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Amei</td>\n",
       "      <td>62</td>\n",
       "      <td>NaN</td>\n",
       "      <td>amei@qq.com</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     name  Age level                 Email  @position\n",
       "0  jordon   18  high       jordon@sohu.com        6.0\n",
       "2  Kelvin   45     M  KelvinChai@gmail.com       10.0\n",
       "3  xiaoLi   23     L        xiaoli@163.com        6.0\n",
       "5    Amei   62   NaN           amei@qq.com        4.0"
      ]
     },
     "execution_count": 217,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[df.Email.str.contains('jordon|com',na=False)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8、文本的虚拟变量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Amei</th>\n",
       "      <th>Kelvin</th>\n",
       "      <th>MIKE</th>\n",
       "      <th>jordon</th>\n",
       "      <th>qiqi</th>\n",
       "      <th>xiaoLi</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Amei  Kelvin  MIKE  jordon  qiqi  xiaoLi\n",
       "0     0       0     0       1     0       0\n",
       "1     0       0     1       0     0       0\n",
       "2     0       1     0       0     0       0\n",
       "3     0       0     0       0     0       1\n",
       "4     0       0     0       0     1       0\n",
       "5     1       0     0       0     0       0"
      ]
     },
     "execution_count": 218,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.name.str.get_dummies()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
